/**********************************************************************/
/*
      Title: clean_udb.do
			Author: Robbie Dulin,Clotaire Boyer
			Created: 18 Dec 2019
    	Description: This file cleans SUSENAS_UDB merge data and outputs
        summary statistics.
*/
/**********************************************************************/


/*
Data Notes:
  1. BPNT var has only 1s; assumed to be 0s for UDB HHs
  2. Made number of ART, number of families, number of rooms into categorical
  3. Winsorized land area variable at the 99.5th percentile (upper tail only)
  4. Let number of livestock variables be linear
*/


* Start a fresh, date-stamped text log (any log left open by an earlier run is closed first)
capture log close
local prefix : display %tdCYND td(`c(current_date)')
log using "$log/`prefix'_udb", text replace



/*----------------------------------------------------*/
                /* Section 0: Setup */
/*----------------------------------------------------*/
* Run under Stata 14.2 rules, start from empty memory, and never pause long output
version 14.2
clear
set more off

* NOTE: the UDB globals must point at the mounted UDB merge data container;
* uncomment and adapt the two lines below when they are not defined elsewhere
*global udb_sep18                          "/Volumes/UDB18"
*global udb_mar19                          "/Volumes/NO NAME"


/*----------------------------------------------------*/
        /* Section 2: Clean Sep 18 UDB Merge */
/*----------------------------------------------------*/

* Load the Sep 2018 household-level (RT) merge file and list its variables
use "$udb_sep18/SSN_BDT_KOR1809_RT_URUT_FULL.dta", clear
ds

* Number of households with a non-empty UDB id, followed by the total row count
count if idbdt != ""
display _N
// NOTE: 25,906 / 74,019 HHs present in Sep 18 merge
// subdistrict and village codes only available for matched HHs but are available in separate Sep 18 data set

* Drop SUSENAS variables: only fwt through match_rt and m101 through m105 are retained
keep fwt-match_rt m101-m105
quietly ds
display "`r(varlist)'"
count if idbdt != ""

* In-UDB dummy: 1 when the household has a non-empty UDB id
generate udb = idbdt != ""
tabulate udb

* Codes 4 and 6 of sta_keberadaan_rt are set to missing
tabulate sta_keberadaan_rt
replace sta_keberadaan_rt = . if inlist(sta_keberadaan_rt, 4, 6)

*** For each listed variable build a missing flag (<name>_m_udb) and a zero-filled copy (<name>_udb)
local zero_fill adapkh adakks2016 adakks2017 adapbi adadapodik sta_keberadaan_rt flag_ada_di_pbdt15 ///
  jumlah_sapi jumlah_kerbau jumlah_kuda jumlah_babi jumlah_kambing percentile
foreach v of varlist `zero_fill' {
  tabulate `v'

  // flag is 1 where the source value is system missing
  generate `v'_m_udb = `v' == .

  // copy of the source in which system missing becomes 0
  recode `v' (. = 0), generate(`v'_udb)

  // cross-check the copy against the flag
  tabulate `v'_udb `v'_m_udb
}


** BPNT eligibility
// NOTE: adabpnt has no 0 for ineligible UDB HHs; I assume all UDB HHs that are missing are 0s
tabulate adabpnt

* Copy of adabpnt in which system missing is replaced by 0
generate adabpnt_udb = cond(adabpnt == ., 0, adabpnt)

* The missing flag is raised only for households outside the UDB (udb == 0) whose value is 0
generate adabpnt_m_udb = (adabpnt_udb == 0) & (udb == 0)
tabulate adabpnt_udb adabpnt_m_udb

* Shorten two long variable names before dummy suffixes are appended to them
rename cara_peroleh_airminum caraperolehairmin
rename sumber_penerangan sumberpen

* Expand each categorical variable into one dummy per category plus a missing flag
local cat_vars sta_bangunan sta_lahan lantai dinding atap sumber_airminum caraperolehairmin ///
  sumberpen daya bb_masak fasbab kloset buang_tinja
foreach v of varlist `cat_vars' {
  // one indicator per observed category
  tabulate `v', generate(`v')

  // indicators are missing where the source is missing: zero them and add the _udb suffix
  foreach d of varlist `v'?* {
    replace `d' = 0 if `d' == .
    rename `d' `d'_udb
  }

  // flag observations whose source value is system missing
  generate `v'_m_udb = `v' == .

  // category dummies plus the missing flag must sum to exactly 1 in every row
  egen test = rowtotal(`v'*udb)
  assert test == 1
  drop test
}

*** Yes/no variables coded 1 = 3 = yes and 2 = 4 = no
* Values 61 and 28 of aset_tak_bergerak do not correspond to the survey and are set to missing
replace aset_tak_bergerak = . if inlist(aset_tak_bergerak, 28, 61)

// NOTE: According to the survey, these vars are either 1 = yes, 2 = no; or 3=yes, 4=no; some have all four values mixed in
// Every 1 and 3 is treated as yes, every 2 and 4 as no
local yn_vars kondisi_dinding kondisi_atap ada_tabung_gas ada_ac ada_telepon ada_emas ada_sepeda ada_motor_tempel sta_art_usaha ///
  sta_kks sta_kis sta_pkh sta_jamsostek ada_lemari_es ada_pemanas ada_tv ada_laptop ada_motor ada_mobil ada_perahu ada_perahu_motor ///
  ada_kapal sta_kip sta_bpjs_mandiri sta_asuransi sta_rastra sta_kur aset_tak_bergerak rumah_lain
foreach v of varlist `yn_vars' {
    // NOTE: the questionnaire only allows 1, 2, 3 or 4; a 0 is treated as missing
    recode `v' (0 = .)
    assert inlist(`v', ., 1, 2, 3, 4)

    // 0/1 copy: 1 and 3 end up as 1; 2, 4 and system missing end up as 0
    recode `v' (3 = 1) (2 4 = 0) (. = 0), generate(`v'_udb)

    // flag rows whose (cleaned) source value is missing
    generate `v'_m_udb = `v' == .
    tabulate `v'_udb `v'_m_udb
    }

*** Leftover variables
/*
jumlah_art jumlah_keluarga luas_lantai jumlah_kamar nomor_gas aset_tak_bergerak luas_atb rumah_lain jumlah_sapi jumlah_kerbau jumlah_kuda
jumlah_babi jumlah_kambing sta_kur sta_keberadaan_rt percentile id_pengurus flag_ada_di_pbdt15
*/

** jumlah_art: number of HH members
// create dummies, treating as categorical variable to allow it to enter flexibly (10+ combined)
// NOTE(review): the dummies are numbered by category rank, so dummy i equals "value i" only if the
// observed values are the consecutive integers 1, 2, 3, ... - TODO confirm in the tab output
tab jumlah_art, gen(jumlah_art)

// Create indicator for 10+ category
// Derived from the value itself rather than from an enumerated list of dummies (jumlah_art10 ... jumlah_art19):
// the enumeration fails when fewer categories exist and silently omits any category beyond the 19th.
// Missing household sizes get 0 here and are flagged by jumlah_art_m_udb below.
gen jumlah_art10up_udb = jumlah_art >= 10 & !missing(jumlah_art)
tab jumlah_art10up_udb

// set missings to 0 in the dummies for the first nine categories and add the _udb suffix
forvalues i  = 1 / 9 {
  recode jumlah_art`i' (. = 0)
  rename jumlah_art`i' jumlah_art`i'_udb
}

// create missing indicator
gen jumlah_art_m_udb = jumlah_art == .


** jumlah_keluarga: number of families in HH
// by my reading, 0 families should be impossible (every individual belongs to a family); I recode 0 to missing
replace jumlah_keluarga = . if jumlah_keluarga == 0

// create dummies
// NOTE(review): dummy i equals "value i" only if the observed values are the consecutive
// integers 1, 2, 3, ... - TODO confirm in the tab output
tab jumlah_keluarga, gen(jumlah_keluarga)

// create indicator for 4+ category
// Derived from the value itself rather than from the enumerated dummies jumlah_keluarga4 ... jumlah_keluarga8,
// which fails when fewer categories exist and silently omits any category beyond the 8th.
// Missing values get 0 here and are flagged by jumlah_keluarga_m_udb below.
gen jumlah_keluarga4up_udb = jumlah_keluarga >= 4 & !missing(jumlah_keluarga)

// set missings to 0 in the dummies for the first three categories and add the _udb suffix
forvalues i  = 1 / 3 {
  recode jumlah_keluarga`i' (. = 0)
  rename jumlah_keluarga`i' jumlah_keluarga`i'_udb
}

// create missing indicator
gen jumlah_keluarga_m_udb = jumlah_keluarga == .
tab jumlah_keluarga_m_udb


** luas_lantai: floor area
* Convert to numeric (destring), inspect, then build the zero-filled copy
destring luas_lantai, replace
summarize luas_lantai
recode luas_lantai (. = 0), generate(luas_lantai_udb)

* Flag rows where the floor area is system missing
generate luas_lantai_m_udb = luas_lantai == .


** jumlah_kamar: number of rooms
// create dummies, treating as categorical variable (4+ combined)
tab jumlah_kamar, gen(jumlah_kamar)

// number of categories just tabulated; used instead of a hard-coded count so the
// rename loop neither fails nor stops short when the number of categories changes
local n_kamar_cat = r(r)

// rename dummies to match values (shift the index down by one)
// NOTE(review): this assumes the observed values are the consecutive integers 0, 1, 2, ... - TODO confirm
forval i = 1 / `n_kamar_cat' {
  local j = `i' - 1
  rename jumlah_kamar`i' jumlah_kamar`j'
}

// create 4+ category
// Derived from the value itself rather than from an enumerated list of dummies (jumlah_kamar4 ... jumlah_kamar14).
// Missing values get 0 here and are flagged by jumlah_kamar_m_udb below.
gen jumlah_kamar4up_udb = jumlah_kamar >= 4 & !missing(jumlah_kamar)
tab jumlah_kamar4up_udb

// recode missings to 0 in the dummies for 0-3 rooms and add the _udb suffix
forval i = 0 / 3 {
  recode jumlah_kamar`i' (. = 0)
  rename jumlah_kamar`i' jumlah_kamar`i'_udb
}

// create missing indicator
gen jumlah_kamar_m_udb = jumlah_kamar == .

summ jumlah_kamar*udb


** luas_atb: land area
* A "-" entry is blanked out so that the string variable can be converted to numeric
replace luas_atb = "" if luas_atb == "-"
destring luas_atb, replace

* Outlier inspection: z-score from the mean and sd reported by summarize
summarize luas_atb, detail
generate luas_atb_z = (luas_atb - `r(mean)') / `r(sd)'
summarize luas_atb_z
* extremes is allowed to fail without stopping the script (capture), while still showing its output
capture noisily extremes luas_atb luas_atb_z, high n(25)

* Winsorize: non-missing values above the 99.5th percentile are set to that percentile
generate luas_atb_udb = luas_atb
_pctile luas_atb, p(99.5)
replace luas_atb_udb = `r(r1)' if luas_atb_udb > `r(r1)' & !missing(luas_atb_udb)
summarize luas_atb_udb

* Missing flag is taken from the un-winsorized source
generate luas_atb_m_udb = luas_atb == .
tabulate luas_atb_m_udb

* Remaining system-missing values in the winsorized copy become 0
recode luas_atb_udb (. = 0)

*** Retain the household id and every variable whose name ends in udb, then park the result in a tempfile
keep urut2 *udb

tempfile sep18_rt
save `sep18_rt'


*** Create Head of HH variables from individual survey
u "$udb_sep18/SSN_BDT_KOR1809_ID_FULL.dta", clear
keep fwt-source_name
ds

* Head of HH: flagged when hub_krt == 1
gen head_hh = hub_krt == 1
tab head_hh

* Age of head of HH (left missing for everyone who is not flagged as head)
gen age = umur if head_hh == 1
summ age

* keep only head of HH observations
// some HHs report more than one HoH for some reason
// when this is the case, I make the oldest one the head of household
bysort urut2: egen head_hh_check = total(head_hh)
count if head_hh_check > 1

// Order every HH so that its oldest head is the LAST row: non-heads first, then heads with
// unknown age, then heads by increasing age. Sorting on age alone does not work because age is
// missing for non-heads and missing sorts last, so a non-head would occupy the last row and
// every head of a multi-head HH would be un-flagged (the HH would lose all its heads).
// Heads tied on age are resolved arbitrarily by the sort.
gen byte age_known = !missing(age)
bysort urut2 (head_hh age_known age): gen head_hh_fix = _n == _N if head_hh == 1
tab head_hh_fix head_hh
replace head_hh = head_hh_fix if head_hh_check > 1
tab head_hh_fix head_hh

keep if head_hh == 1

* Male head of HH (missing when jnskel is missing)
gen male = jnskel == 1
replace male = . if jnskel == .

* HoH marital status
// NOTE(review): the dummies created here are not in the keep list below; only the raw
// sta_kawin code is carried forward - confirm whether the dummies were intended instead
tab sta_kawin, gen(sta_kawin)

* education level of edu reached (code 99 is set to missing before dummies are built)
tab pendidikan_tertinggi
recode pendidikan_tertinggi (99 = .)
tab pendidikan_tertinggi, gen(pendidikan_tertinggi)
summ pendidikan_tertinggi?*

* highest grade ever reached
tab kelas_tertinggi, gen(kelas_tertinggi)

* highest diploma received
tab ijazah_tertinggi, gen(ijazah_tertinggi)

* work status (missing when sta_bekerja is missing)
tab sta_bekerja
gen worked = sta_bekerja == 1
replace worked = . if sta_bekerja == .

* hours worked
summ jumlah_jamkerja

* field of business (code 0 is set to missing before dummies are built)
tab lapangan_usaha
recode lapangan_usaha (0 = .)
tab lapangan_usaha, gen(lapangan_usaha)

* job status (code 0 is set to missing before dummies are built)
tab status_pekerjaan
recode status_pekerjaan (0 = .)
tab status_pekerjaan, gen(status_pekerjaan)

// keep the HH id and the head-level variables, then suffix everything except the id with _hoh_udb
keep urut2 status_pekerjaan?* lapangan_usaha? lapangan_usaha?? worked jumlah_jamkerja ijazah_tertinggi? kelas_tertinggi? ///
  pendidikan_tertinggi? pendidikan_tertinggi?? sta_kawin male age
rename * *_hoh_udb
rename urut2_hoh_udb urut2

tempfile head_hh_sep18
save `head_hh_sep18'


*** Create overall household variables from individual survey
u "$udb_sep18/SSN_BDT_KOR1809_ID_FULL.dta", clear
keep fwt-source_name
ds

// restrict to individuals that carry a UDB id
gen udb = idbdt != ""
keep if udb == 1

* pregnancy status (indicator plus missing flag)
tab sta_hamil
gen pregnant = sta_hamil == 1
gen pregnant_m = sta_hamil == .

* disability type: any code from 1 to 12 counts as a disability
tab jenis_cacat
gen disability = 1 <= jenis_cacat & jenis_cacat <= 12
gen disability_m = jenis_cacat == .

* chronic disease: any code from 1 to 9 counts
tab penyakit_kronis
gen chronic_disease = 1 <= penyakit_kronis & penyakit_kronis <= 9
gen chronic_disease_m = penyakit_kronis == .

* number in school
gen in_school = partisipasi_sekolah == 1
gen in_school_m = partisipasi_sekolah == .

* highest level of education completed (code 99 is set to missing)
tab pendidikan_tertinggi
recode pendidikan_tertinggi (99 = .)

* highest grade ever reached
tab kelas_tertinggi

* highest degree
tab ijazah_tertinggi

* worked in past week (code 2 becomes 0)
tab sta_bekerja
gen work = sta_bekerja
recode work (2 = 0)
gen work_m = sta_bekerja == .

* number in HH
// rows per household among the retained individuals, counted directly by group;
// this yields the same values as saving a HH-level count and merging it back in
bysort urut2: gen num_in_hh = _N
summ num_in_hh

tab num_in_hh, gen(num_in_hh)

// keep vars
keep urut2 pregnant disability chronic_disease in_school work pendidikan_tertinggi kelas_tertinggi ijazah_tertinggi num_in_hh? num_in_hh?? *_m


// collapse to HH level: counts for the indicators, maxima for the education codes,
// means for the HH-size dummies and for the missing flags
collapse (sum) pregnant disability chronic_disease in_school work (max) max_edu_in_hh = pendidikan_tertinggi max_grade_in_hh = kelas_tertinggi ///
  max_degree_in_hh = ijazah_tertinggi (mean) num_in_hh? num_in_hh?? *_m, by(urut2)
summ
di _N
* Create education variables categories
tab max_edu_in_hh, gen(max_edu_in_hh)
tab max_grade_in_hh, gen(max_grade_in_hh)
tab max_degree_in_hh, gen(max_degree_in_hh)

* Create 1+ in HH dummy
foreach var of varlist pregnant disability chronic_disease in_school work {
  gen `var'_hh = `var' >= 1
  // replace = missing if all obs within HH are missing (mean of the missing flag equals 1)
  replace `var'_hh = . if `var'_m == 1
  tab `var'_hh
}

// keep the HH id and the HH-level variables, then suffix everything except the id with _udb
keep urut2 pregnant_hh disability_hh chronic_disease_hh in_school_hh work_hh max_edu_in_hh? max_edu_in_hh?? max_grade_in_hh? max_degree_in_hh? num_in_hh? num_in_hh??
rename * *_udb
rename urut2_udb urut2

tempfile overall_hh_sep18
save `overall_hh_sep18'

** Combine the household file with the two files built from the individual data
use `sep18_rt', clear

* Head-of-HH variables: every row coming from the using file must find a household
merge 1:1 urut2 using `head_hh_sep18'
assert _merge != 2
drop _merge

// missing flags for dummy families, judged from the first dummy of each family
foreach s in status_pekerjaan lapangan_usaha ijazah_tertinggi kelas_tertinggi pendidikan_tertinggi {
  generate `s'_hoh_m_udb = `s'1_hoh_udb == .
  summarize `s'_hoh_m_udb
}

// missing flags for the single-variable head-of-HH measures
foreach s in worked jumlah_jamkerja sta_kawin male age {
  generate `s'_hoh_m_udb = `s'_hoh_udb == .
  summarize `s'_hoh_m_udb
}

// zero-fill the head-of-HH variables now that missingness is recorded
local hoh_fill status_pekerjaan?_hoh_udb lapangan_usaha?_hoh_udb lapangan_usaha??_hoh_udb ijazah_tertinggi?_hoh_udb kelas_tertinggi?_hoh_udb ///
  pendidikan_tertinggi?_hoh_udb pendidikan_tertinggi??_hoh_udb worked_hoh_udb jumlah_jamkerja_hoh_udb sta_kawin_hoh_udb male_hoh_udb age_hoh_udb
foreach v of varlist `hoh_fill' {
    replace `v' = 0 if `v' == .
  }

* Household aggregates: same requirement on the merge
merge 1:1 urut2 using `overall_hh_sep18'
assert _merge != 2
drop _merge

// missing flags for the "at least one member" indicators
foreach s in pregnant_hh disability_hh chronic_disease_hh in_school_hh work_hh {
  generate `s'_m_udb = `s'_udb == .
  summarize `s'_m_udb
}

// missing flags for dummy families, judged from the first dummy of each family
foreach s in max_edu_in_hh max_grade_in_hh max_degree_in_hh num_in_hh {
  generate `s'_m_udb = `s'1_udb == .
  summarize `s'_m_udb
}

// zero-fill the household aggregates
local hh_fill pregnant_hh_udb disability_hh_udb chronic_disease_hh_udb in_school_hh_udb work_hh_udb max_edu_in_hh?_udb max_edu_in_hh??_udb ///
  max_grade_in_hh?_udb max_degree_in_hh?_udb num_in_hh?_udb num_in_hh??_udb
foreach v of varlist `hh_fill' {
    replace `v' = 0 if `v' == .
  }

// no variable ending in udb may contain a system-missing value at this point
foreach v of varlist *udb {
  assert `v' != .
}

ds

save "$cleaned/sep_2018_udb_lasso_pool.dta", replace
display c(k)
// 336 vars

/*----------------------------------------------------*/
        /* Section 3: Clean Mar 19 UDB Merge */
/*----------------------------------------------------*/

* Load the Mar 2019 household-level (RT) merge file; report UDB-id rows and total rows
use "$udb_mar19/SSN_BDT_KOR1903_RT_URUT_FULL.dta", clear
count if idbdt != ""
display _N
// NOTE: 111,671 / 315,672 HHs are matched
// Individual dataset has village and kec codes
drop r101-exp_cap
ds

* Build a one-row-per-renum lookup of r103 and r104 from the individual file
preserve
use "$udb_mar19/SSN_BDT_KOR1903_ID_FULL.dta", clear
count if r103 == .
count if r104 == .
summarize renum
keep renum r103 r104
// the first row of each renum is retained, whatever the other rows contain
duplicates drop renum, force
display _N
tempfile kec_vil_codes
save `kec_vil_codes'
restore

* Attach the codes; every row on both sides must match
merge 1:1 renum using `kec_vil_codes', assert(match)
drop _merge

quietly ds
display "`r(varlist)'"


* In-UDB dummy, taken from match_rt == 1
tabulate match_rt
generate udb = match_rt == 1
tabulate udb

// NOTE: This var is not found in survey, I assume it is a 0/1 variable so I recode other values to .
// sta_keberadaan_rt: Existence of HH members outside the household
tabulate sta_keberadaan_rt
replace sta_keberadaan_rt = . if inlist(sta_keberadaan_rt, 4, 6)

*** For each listed variable build a missing flag (<name>_m_udb) and a zero-filled copy (<name>_udb)
local zero_fill adapkh adakks2016 adakks2017 adapbi adadapodik sta_keberadaan_rt flag_ada_di_pbdt15 ///
  jumlah_sapi jumlah_kerbau jumlah_kuda jumlah_babi jumlah_kambing percentile
foreach v of varlist `zero_fill' {
  tabulate `v'

  // flag is 1 where the source value is system missing
  generate `v'_m_udb = `v' == .

  // copy of the source in which system missing becomes 0
  recode `v' (. = 0), generate(`v'_udb)

  // cross-check the copy against the flag
  tabulate `v'_udb `v'_m_udb
}

** BPNT eligibility
// NOTE: adabpnt has no 0 for ineligible UDB HHs; I assume all UDB HHs that are missing are 0s
tabulate adabpnt

* Zero-filled copy: system missing becomes 0
recode adabpnt (. = 0), generate(adabpnt_udb)

* The missing flag is raised only for households outside the UDB (udb == 0) whose value is 0
generate adabpnt_m_udb = (adabpnt_udb == 0) & (udb == 0)
tabulate adabpnt_udb adabpnt_m_udb

* Shorten two long variable names before dummy suffixes are appended to them
rename cara_peroleh_airminum caraperolehairmin
rename sumber_penerangan sumberpen

* Expand each categorical variable into one dummy per category plus a missing flag
local cat_vars sta_bangunan sta_lahan lantai dinding atap sumber_airminum caraperolehairmin ///
  sumberpen daya bb_masak fasbab kloset buang_tinja
foreach v of varlist `cat_vars' {
  // one indicator per observed category
  tabulate `v', generate(`v')

  // indicators are missing where the source is missing: zero them and add the _udb suffix
  foreach d of varlist `v'?* {
    replace `d' = 0 if `d' == .
    rename `d' `d'_udb
  }

  // flag observations whose source value is system missing
  generate `v'_m_udb = `v' == .

  // category dummies plus the missing flag must sum to exactly 1 in every row
  egen test = rowtotal(`v'*udb)
  assert test == 1
  drop test
}


*** Yes/no variables coded 1 = 3 = yes and 2 = 4 = no
* Values 19, 14, 7 and 6 of aset_tak_bergerak do not correspond to the survey and are set to missing
tabulate aset_tak_bergerak
replace aset_tak_bergerak = . if inlist(aset_tak_bergerak, 6, 7, 14, 19)

// Every 1 and 3 is treated as yes, every 2 and 4 as no
local yn_vars kondisi_dinding kondisi_atap ada_tabung_gas ada_ac ada_telepon ada_emas ada_sepeda ada_motor_tempel sta_art_usaha ///
  sta_kks sta_kis sta_pkh sta_jamsostek ada_lemari_es ada_pemanas ada_tv ada_laptop ada_motor ada_mobil ada_perahu ada_perahu_motor ///
  ada_kapal sta_kip sta_bpjs_mandiri sta_asuransi sta_rastra aset_tak_bergerak rumah_lain sta_kur
foreach v of varlist `yn_vars' {
    // only missing and the codes 0 to 4 are expected in the raw data
    assert inlist(`v', ., 0, 1, 2, 3, 4)

    // NOTE: the questionnaire only allows 1, 2, 3 or 4; a 0 is treated as missing
    recode `v' (0 = .)

    // 0/1 copy: 1 and 3 end up as 1; 2, 4 and system missing end up as 0
    recode `v' (3 = 1) (2 4 = 0) (. = 0), generate(`v'_udb)

    // flag rows whose (cleaned) source value is missing
    generate `v'_m_udb = `v' == .
    tabulate `v'_udb `v'_m_udb
  }

/*
   nomor_meter_air     nomor_gas
       id_pengurus  match_rt r103 r104
*/

** jumlah_art: number of HH members
// create dummies, treating as categorical variable to allow it to enter flexibly (10+ combined)
// NOTE(review): the dummies are numbered by category rank, so dummy i equals "value i" only if the
// observed values are the consecutive integers 1, 2, 3, ... - TODO confirm in the tab output
tab jumlah_art, gen(jumlah_art)

// Create indicator for 10+ category
// No hard-coded upper bound: with a cap at 34, a larger household would fall into no size
// category and would not be flagged as missing either. Missing sizes are excluded explicitly
// because missing compares as larger than any number; they are flagged by jumlah_art_m_udb below.
gen jumlah_art10up_udb = jumlah_art >= 10 & !missing(jumlah_art)
tab jumlah_art10up_udb

// set missings to 0 in the dummies for the first nine categories and add the _udb suffix
forvalues i  = 1 / 9 {
  recode jumlah_art`i' (. = 0)
  rename jumlah_art`i' jumlah_art`i'_udb
}

// create missing indicator
gen jumlah_art_m_udb = jumlah_art == .


** jumlah_keluarga: number of families in HH
// by my reading, 0 families should be impossible (every individual belongs to a family); 0 is recoded to missing
recode jumlah_keluarga (0 = .)

* One dummy per observed category
tabulate jumlah_keluarga, generate(jumlah_keluarga)

* 4+ category, taken from the value itself
generate jumlah_keluarga4up_udb = (jumlah_keluarga != .) & (jumlah_keluarga >= 4)
tabulate jumlah_keluarga4up_udb

* Dummies for the first three categories: zero-fill and add the _udb suffix
forvalues k = 1 / 3 {
  replace jumlah_keluarga`k' = 0 if jumlah_keluarga`k' == .
  rename jumlah_keluarga`k' jumlah_keluarga`k'_udb
}

* Flag rows where the number of families is system missing
generate jumlah_keluarga_m_udb = jumlah_keluarga == .
tabulate jumlah_keluarga_m_udb

summarize jumlah_keluarga*udb

** luas_lantai: floor area
* Convert to numeric (destring), inspect, then build the zero-filled copy
destring luas_lantai, replace
summarize luas_lantai
recode luas_lantai (. = 0), generate(luas_lantai_udb)

* Flag rows where the floor area is system missing
generate luas_lantai_m_udb = luas_lantai == .

** jumlah_kamar: number of rooms
* One dummy per observed category (4+ is combined further down)
tabulate jumlah_kamar, generate(jumlah_kamar)

* Shift every dummy index down by one; the loop bound is the number of categories just tabulated
forvalues k = 1 / `r(r)' {
  local shifted = `k' - 1
  rename jumlah_kamar`k' jumlah_kamar`shifted'
}

* 4+ category, taken from the value itself
generate jumlah_kamar4up_udb = (jumlah_kamar != .) & (jumlah_kamar >= 4)
tabulate jumlah_kamar4up_udb

* Dummies 0 to 3: zero-fill and add the _udb suffix
forvalues k = 0 / 3 {
  replace jumlah_kamar`k' = 0 if jumlah_kamar`k' == .
  rename jumlah_kamar`k' jumlah_kamar`k'_udb
}

* Flag rows where the number of rooms is system missing
generate jumlah_kamar_m_udb = jumlah_kamar == .

summarize jumlah_kamar*udb


** luas_atb: land area
* A "-" entry is blanked out so that the string variable can be converted to numeric
replace luas_atb = "" if luas_atb == "-"
destring luas_atb, replace

* Outlier inspection: z-score from the mean and sd reported by summarize
summarize luas_atb, detail
generate luas_atb_z = (luas_atb - `r(mean)') / `r(sd)'
summarize luas_atb_z
* extremes is allowed to fail without stopping the script (capture), while still showing its output
capture noisily extremes luas_atb luas_atb_z, high n(25)

* Winsorize: non-missing values above the 99.5th percentile are set to that percentile
generate luas_atb_udb = luas_atb
_pctile luas_atb, p(99.5)
replace luas_atb_udb = `r(r1)' if luas_atb_udb > `r(r1)' & !missing(luas_atb_udb)
summarize luas_atb_udb

* Missing flag is taken from the un-winsorized source
generate luas_atb_m_udb = luas_atb == .
tabulate luas_atb_m_udb

* Remaining system-missing values in the winsorized copy become 0
recode luas_atb_udb (. = 0)

*** Retain the ids, the two area codes and every variable whose name ends in udb
keep renum r103 r104 *udb

* None of the retained udb variables may contain a system-missing value
foreach v of varlist *udb {
  assert `v' != .
}

tempfile mar_2019_rt
save `mar_2019_rt'

*** Create Head of HH variables from individual survey
u "$udb_mar19/SSN_BDT_KOR1903_ID_FULL", clear
keep fwt-source_name
ds

* Head of HH: flagged when hub_krt == 1
gen head_hh = hub_krt == 1
tab head_hh

* Age of head of HH (left missing for everyone who is not flagged as head)
gen age = umur if head_hh == 1
summ age
// _pctile age, p(0.5 99.5)
// replace age = `r(r1)' if age < `r(r1)'
// replace age = `r(r2)' if age > `r(r2)' & !missing(age)
// summ age

* keep only head of HH observations
// some HHs report more than one HoH for some reason
// when this is the case, I make the oldest one the head of household
bysort renum: egen head_hh_check = total(head_hh)
count if head_hh_check > 1

// Order every HH so that its oldest head is the LAST row: non-heads first, then heads with
// unknown age, then heads by increasing age. Sorting on age alone does not work because age is
// missing for non-heads and missing sorts last, so a non-head would occupy the last row and
// every head of a multi-head HH would be un-flagged (the HH would lose all its heads).
// Heads tied on age are resolved arbitrarily by the sort.
gen byte age_known = !missing(age)
bysort renum (head_hh age_known age): gen head_hh_fix = _n == _N if head_hh == 1
tab head_hh_fix head_hh
replace head_hh = head_hh_fix if head_hh_check > 1
tab head_hh_fix head_hh

keep if head_hh == 1

* Male head of HH (missing when jnskel is missing)
gen male = jnskel == 1
replace male = . if jnskel == .

* HoH marital status
// NOTE(review): the dummies created here are not in the keep list below; only the raw
// sta_kawin code is carried forward - confirm whether the dummies were intended instead
tab sta_kawin, gen(sta_kawin)

* education level of edu reached (code 99 is set to missing before dummies are built)
tab pendidikan_tertinggi
recode pendidikan_tertinggi (99 = .)
tab pendidikan_tertinggi, gen(pendidikan_tertinggi)
summ pendidikan_tertinggi?*

* highest grade ever reached
tab kelas_tertinggi, gen(kelas_tertinggi)

* highest diploma received
tab ijazah_tertinggi, gen(ijazah_tertinggi)

* work status (missing when sta_bekerja is missing)
tab sta_bekerja
gen worked = sta_bekerja == 1
replace worked = . if sta_bekerja == .

* hours worked (a value of -1 is set to missing)
summ jumlah_jamkerja
recode jumlah_jamkerja (-1 = .)

* field of business (code 0 is set to missing before dummies are built)
tab lapangan_usaha
recode lapangan_usaha (0 = .)
tab lapangan_usaha, gen(lapangan_usaha)

* job status (code 0 is set to missing before dummies are built)
tab status_pekerjaan
recode status_pekerjaan (0 = .)
tab status_pekerjaan, gen(status_pekerjaan)

// keep the HH id and the head-level variables, then suffix everything except the id with _hoh_udb
keep renum status_pekerjaan?* lapangan_usaha? lapangan_usaha?? worked jumlah_jamkerja ijazah_tertinggi? kelas_tertinggi? ///
  pendidikan_tertinggi? pendidikan_tertinggi?? sta_kawin male age
rename * *_hoh_udb
rename renum_hoh_udb renum

tempfile head_hh_mar19
save `head_hh_mar19'


*** Create overall household variables from individual survey
u "$udb_mar19/SSN_BDT_KOR1903_ID_FULL", clear
keep fwt-source_name

// restrict to individuals that carry a UDB id
gen udb = idbdt != ""
keep if udb == 1

* pregnancy status (indicator plus missing flag)
tab sta_hamil
gen pregnant = sta_hamil == 1
gen pregnant_m = sta_hamil == .

* disability type: any code from 1 to 12 counts as a disability
tab jenis_cacat
gen disability = 1 <= jenis_cacat & jenis_cacat <= 12
gen disability_m = jenis_cacat == .

* chronic disease: any code from 1 to 9 counts
tab penyakit_kronis
gen chronic_disease = 1 <= penyakit_kronis & penyakit_kronis <= 9
gen chronic_disease_m = penyakit_kronis == .

* number in school
gen in_school = partisipasi_sekolah == 1
gen in_school_m = partisipasi_sekolah == .

* highest level of education completed (code 99 is set to missing)
tab pendidikan_tertinggi
recode pendidikan_tertinggi (99 = .)

* highest grade ever reached
tab kelas_tertinggi

* highest degree
tab ijazah_tertinggi

* worked in past week (code 2 becomes 0)
tab sta_bekerja
gen work = sta_bekerja
recode work (2 = 0)
gen work_m = sta_bekerja == .

* number in HH
// rows per household among the retained individuals, counted directly by group;
// this yields the same values as saving a HH-level count and merging it back in
bysort renum: gen num_in_hh = _N
summ num_in_hh

tab num_in_hh, gen(num_in_hh)

// keep vars
keep renum pregnant disability chronic_disease in_school work pendidikan_tertinggi kelas_tertinggi ijazah_tertinggi num_in_hh? num_in_hh?? *_m


// collapse to HH level: counts for the indicators, maxima for the education codes,
// means for the HH-size dummies and for the missing flags
collapse (sum) pregnant disability chronic_disease in_school work (max) max_edu_in_hh = pendidikan_tertinggi max_grade_in_hh = kelas_tertinggi ///
  max_degree_in_hh = ijazah_tertinggi (mean) num_in_hh? num_in_hh?? *_m, by(renum)
summ

di _N
* Create education variables categories
tab max_edu_in_hh, gen(max_edu_in_hh)
tab max_grade_in_hh, gen(max_grade_in_hh)
tab max_degree_in_hh, gen(max_degree_in_hh)

* Create 1+ in HH dummy
foreach var of varlist pregnant disability chronic_disease in_school work {
  gen `var'_hh = `var' >= 1

  // replace = missing if all obs within HH are missing (mean of the missing flag equals 1)
  replace `var'_hh = . if `var'_m == 1
  tab `var'_hh
}

// keep the HH id and the HH-level variables, then suffix everything except the id with _udb
keep renum pregnant_hh disability_hh chronic_disease_hh in_school_hh work_hh max_edu_in_hh? max_edu_in_hh?? max_grade_in_hh? max_degree_in_hh? num_in_hh? num_in_hh??
rename * *_udb
rename renum_udb renum

tempfile overall_hh_mar19
save `overall_hh_mar19'

** Combine the household file with the two files built from the individual data
use `mar_2019_rt', clear

* Head-of-HH variables: every row coming from the using file must find a household
merge 1:1 renum using `head_hh_mar19'
assert _merge != 2
drop _merge

// missing flags for dummy families, judged from the first dummy of each family
foreach s in status_pekerjaan lapangan_usaha ijazah_tertinggi kelas_tertinggi pendidikan_tertinggi {
  generate `s'_hoh_m_udb = `s'1_hoh_udb == .
  summarize `s'_hoh_m_udb
}

// missing flags for the single-variable head-of-HH measures
foreach s in worked jumlah_jamkerja sta_kawin male age {
  generate `s'_hoh_m_udb = `s'_hoh_udb == .
  summarize `s'_hoh_m_udb
}

// zero-fill the head-of-HH variables now that missingness is recorded
local hoh_fill status_pekerjaan?_hoh_udb lapangan_usaha?_hoh_udb lapangan_usaha??_hoh_udb ijazah_tertinggi?_hoh_udb kelas_tertinggi?_hoh_udb ///
  pendidikan_tertinggi?_hoh_udb pendidikan_tertinggi??_hoh_udb worked_hoh_udb jumlah_jamkerja_hoh_udb sta_kawin_hoh_udb male_hoh_udb age_hoh_udb
foreach v of varlist `hoh_fill' {
    replace `v' = 0 if `v' == .
  }

* Household aggregates: same requirement on the merge
merge 1:1 renum using `overall_hh_mar19'
assert _merge != 2
drop _merge

// missing flags for the "at least one member" indicators
foreach s in pregnant_hh disability_hh chronic_disease_hh in_school_hh work_hh {
  generate `s'_m_udb = `s'_udb == .
  summarize `s'_m_udb
}

// missing flags for dummy families, judged from the first dummy of each family
foreach s in max_edu_in_hh max_grade_in_hh max_degree_in_hh num_in_hh {
  generate `s'_m_udb = `s'1_udb == .
  summarize `s'_m_udb
}

// zero-fill the household aggregates
local hh_fill pregnant_hh_udb disability_hh_udb chronic_disease_hh_udb in_school_hh_udb work_hh_udb max_edu_in_hh?_udb max_edu_in_hh??_udb ///
  max_grade_in_hh?_udb max_degree_in_hh?_udb num_in_hh?_udb num_in_hh??_udb
foreach v of varlist `hh_fill' {
    replace `v' = 0 if `v' == .
  }

// no variable ending in udb may contain a system-missing value at this point
foreach v of varlist *udb {
  assert `v' != .
}

display c(k)
// 352 variables
save "$cleaned/mar_2019_udb_lasso_pool.dta", replace

capture log close
